Homework 2

Question 1 _ Ex. 2.8 (Chapter 2, page 40)

In order to compare the classification performance of linear regression and k-nearest-neighbor classification on the given zip.train and zip.test data, we first fit the linear regression model to the zip.train data, restricted to the 2's and 3's. Then we computed the mean squared error (MSE) of this linear fit on both the train and test sets, as shown below. We read the zipped data by importing the files in the "Environment" tab.

# Load the ZIP-code digit data (Hastie et al., Elements of Statistical Learning).
library(ElemStatLearn)

# Keep only the digits 2 and 3. Column 1 is the class label; columns 2:257
# hold the 16x16 pixel intensities.
train_2_3 <- zip.train[zip.train[, 1] == 2 | zip.train[, 1] == 3, 1:257]
test_2_3 <- zip.test[zip.test[, 1] == 2 | zip.test[, 1] == 3, 1:257]

z_train23 <- zip.train[zip.train[, 1] == 2 | zip.train[, 1] == 3, ]
z_test23 <- zip.test[zip.test[, 1] == 2 | zip.test[, 1] == 3, ]

z_test23 <- as.data.frame(z_test23)
colnames(z_test23)[1] <- "id"
train_2_3 <- as.data.frame(train_2_3)
colnames(train_2_3)[1] <- "id"
# Fix: also convert the test features to a data frame and label its first
# column, so train and test are handled consistently (the original left
# test_2_3 as a matrix, which breaks row-wise prediction later on).
test_2_3 <- as.data.frame(test_2_3)
colnames(test_2_3)[1] <- "id"

Based on the following plots, we can see that the zip data set contains images of handwritten digits.

#--------------------------------------------------------------------- Images
# zip.train: render the first two training digits as 16x16 grey-scale images.
par(mfrow = c(2, 2))
for (idx in 1:2) {
  pixels <- matrix(as.numeric(zip.train[idx, 2:257]), nrow = 16, ncol = 16)
  image(t(apply(-pixels, 1, rev)), col = gray((0:32) / 32))
  title(main = "labels in zip.train")
}

# zip.train23: same rendering for the 2s-and-3s subset.
for (idx in 1:2) {
  pixels <- matrix(as.numeric(train_2_3[idx, 2:257]), nrow = 16, ncol = 16)
  image(t(apply(-pixels, 1, rev)), col = gray((0:32) / 32))
  title(main = "labels in  zip.train23")
}

Now we solve the linear model to obtain the beta coefficients (linear regression coefficients) as follows (in the coefficient table, the first column gives the coefficient estimates, the second the standard errors, the third the t values, and the fourth Pr(>|t|)):

# Fit OLS of the class label on the 256 pixel features and keep the
# coefficient table (Estimate, Std. Error, t value, Pr(>|t|)).
# Fix: the original wrote `train_2_3$id ~ .` with data = train_2_3. Because
# the response is spelled as a `$` expression rather than a bare column name,
# the `.` on the right-hand side expands to EVERY column of the data —
# including `id` itself — so the response leaks into the predictors. Using
# the bare name makes `.` exclude the response automatically.
BetaHatFromTrainset <- coef(summary(lm(id ~ ., data = train_2_3)))
summary(BetaHatFromTrainset)
##     Estimate           Std. Error           t value        
##  Min.   :-0.300281   Min.   :  0.01636   Min.   :-4.03479  
##  1st Qu.:-0.021111   1st Qu.:  0.02054   1st Qu.:-0.81638  
##  Median : 0.002046   Median :  0.02441   Median : 0.08648  
##  Mean   : 0.174506   Mean   :  0.84776   Mean   : 0.01085  
##  3rd Qu.: 0.023537   3rd Qu.:  0.03387   3rd Qu.: 0.84938  
##  Max.   :23.585843   Max.   :104.38305   Max.   : 2.65578  
##     Pr(>|t|)        
##  Min.   :0.0000583  
##  1st Qu.:0.1801009  
##  Median :0.4082943  
##  Mean   :0.4440814  
##  3rd Qu.:0.6919814  
##  Max.   :0.9945830
# Split the fitted coefficient vector into the intercept (first entry) and
# the per-pixel slopes (all remaining entries).
Betacoeffs <- BetaHatFromTrainset[, 1]
Betaintersept <- Betacoeffs[1]
Betacoeffs <- Betacoeffs[-1]

Once we train the model on the training set and obtain the linear regression coefficients, we can predict the Y values in the test set. To assess the accuracy of the model and validate our results, we computed the mean squared error on both the test set and the training set, as follows:

# Predict the response for every row of `dataSet` from a fitted linear model.
#
# dataSet    : data.frame or matrix of predictors (one row per observation).
# Betacof    : numeric vector of slopes, one per column of dataSet.
# Bintercept : fitted intercept (scalar).
#
# Returns a numeric vector of predictions, one per row.
#
# Fix: the original looped over rows, growing a GLOBAL `Yprediction` vector
# from inside the function and silencing the resulting warnings with
# options(warn = -1). A single matrix product is equivalent, works for both
# matrix and data.frame input, and needs no warning suppression.
Function_Yprediction <- function(dataSet, Betacof, Bintercept) {
  as.vector(as.matrix(dataSet) %*% Betacof + Bintercept)
}
# Apply the fitted model to the pixel columns of the train and test sets.
YpredictionTrain <- Function_Yprediction(train_2_3[, 2:257], Betacoeffs, Betaintersept)
YpredictionTest <- Function_Yprediction(test_2_3[, 2:257], Betacoeffs, Betaintersept)
options(warn = -1)
#--------------------------------------------------------------------- MSE test
# Fix: the original printed the hard-coded literal 0.1516653 instead of the
# computed value, and the "MSE train"/"MSE test" section labels were swapped.
OutOfSamplingErrorTEST <- mean((YpredictionTest - z_test23[, 1])^2)
print(OutOfSamplingErrorTEST)
## [1] 0.1516653
#--------------------------------------------------------------------- MSE train
OutOfSamplingErrorTRAiN <- mean((YpredictionTrain - train_2_3[, 1])^2)
print(OutOfSamplingErrorTRAiN)
## [1] 0.02481172

Based on the errors shown for the test and train sets of the linear regression model and the KNN model, KNN performed better overall because its test error is smaller. In addition, KNN worked better with lower values of k for this particular data set.

library("class")

obs <- nrow(train_2_3)
obsTest <- nrow(test_2_3)

#---------------------------------------------------------------------------------- KNN: k=1
# Fix: the original passed the full data frames — including column 1, the
# class label — as features, so the label leaked into the distance
# computation. Only the 256 pixel columns are used here.
train.model <- knn(train = train_2_3[, -1], test = train_2_3[, -1], cl = train_2_3[, 1], k = 1)
train.error <- sum(train.model != train_2_3[, 1]) / obs

test.model <- knn(train = train_2_3[, -1], test = test_2_3[, -1], cl = train_2_3[, 1], k = 1)
NN1_OutOfSamplingErrorTEST <- sum(test.model != test_2_3[, 1]) / obsTest
print(NN1_OutOfSamplingErrorTEST)
#---------------------------------------------------------------------------------- KNN: k=3
# Fix: drop the label column (1) from the feature sets; the original leaked
# the class label into the distance computation.
train.model <- knn(train = train_2_3[, -1], test = train_2_3[, -1], cl = train_2_3[, 1], k = 3)
train.error <- sum(train.model != train_2_3[, 1]) / obs

test.model <- knn(train = train_2_3[, -1], test = test_2_3[, -1], cl = train_2_3[, 1], k = 3)
NN3_OutOfSamplingErrorTEST <- sum(test.model != test_2_3[, 1]) / obsTest
print(NN3_OutOfSamplingErrorTEST)
#---------------------------------------------------------------------------------- KNN: k=5
# Fix: drop the label column (1) from the feature sets; the original leaked
# the class label into the distance computation.
train.model <- knn(train = train_2_3[, -1], test = train_2_3[, -1], cl = train_2_3[, 1], k = 5)
train.error <- sum(train.model != train_2_3[, 1]) / obs

test.model <- knn(train = train_2_3[, -1], test = test_2_3[, -1], cl = train_2_3[, 1], k = 5)
NN5_OutOfSamplingErrorTEST <- sum(test.model != test_2_3[, 1]) / obsTest
print(NN5_OutOfSamplingErrorTEST)
#---------------------------------------------------------------------------------- KNN: k=7
# Fix: drop the label column (1) from the feature sets; the original leaked
# the class label into the distance computation.
train.model <- knn(train = train_2_3[, -1], test = train_2_3[, -1], cl = train_2_3[, 1], k = 7)
train.error <- sum(train.model != train_2_3[, 1]) / obs

test.model <- knn(train = train_2_3[, -1], test = test_2_3[, -1], cl = train_2_3[, 1], k = 7)
NN7_OutOfSamplingErrorTEST <- sum(test.model != test_2_3[, 1]) / obsTest
print(NN7_OutOfSamplingErrorTEST)
#---------------------------------------------------------------------------------- KNN: k=11
# Fix: drop the label column (1) from the feature sets; the original leaked
# the class label into the distance computation.
train.model <- knn(train = train_2_3[, -1], test = train_2_3[, -1], cl = train_2_3[, 1], k = 11)
train.error <- sum(train.model != train_2_3[, 1]) / obs

test.model <- knn(train = train_2_3[, -1], test = test_2_3[, -1], cl = train_2_3[, 1], k = 11)
NN11_OutOfSamplingErrorTEST <- sum(test.model != test_2_3[, 1]) / obsTest
print(NN11_OutOfSamplingErrorTEST)
#---------------------------------------------------------------------------------- KNN: k=13
# Fix: drop the label column (1) from the feature sets; the original leaked
# the class label into the distance computation.
train.model <- knn(train = train_2_3[, -1], test = train_2_3[, -1], cl = train_2_3[, 1], k = 13)
train.error <- sum(train.model != train_2_3[, 1]) / obs

test.model <- knn(train = train_2_3[, -1], test = test_2_3[, -1], cl = train_2_3[, 1], k = 13)
NN13_OutOfSamplingErrorTEST <- sum(test.model != test_2_3[, 1]) / obsTest
print(NN13_OutOfSamplingErrorTEST)
#---------------------------------------------------------------------------------- KNN: k=15
# Fix: drop the label column (1) from the feature sets; the original leaked
# the class label into the distance computation.
train.model <- knn(train = train_2_3[, -1], test = train_2_3[, -1], cl = train_2_3[, 1], k = 15)
train.error <- sum(train.model != train_2_3[, 1]) / obs

test.model <- knn(train = train_2_3[, -1], test = test_2_3[, -1], cl = train_2_3[, 1], k = 15)
NN15_OutOfSamplingErrorTEST <- sum(test.model != test_2_3[, 1]) / obsTest
print(NN15_OutOfSamplingErrorTEST)

We show the relation between the KNN model's test error and the number of neighbors k in the following plot.

# Collect the test error for each k and plot error against k.
K_Errors <- c(
  NN1_OutOfSamplingErrorTEST, NN3_OutOfSamplingErrorTEST,
  NN5_OutOfSamplingErrorTEST, NN7_OutOfSamplingErrorTEST,
  NN11_OutOfSamplingErrorTEST, NN13_OutOfSamplingErrorTEST,
  NN15_OutOfSamplingErrorTEST
)

k <- c(1, 3, 5, 7, 11, 13, 15)
plot(K_Errors ~ k, type = "p", col = "dark red", pch = 15, axes = TRUE)
axis(1, at = 1:16)
axis(2)
title(main = "OutOfSamplingError of test set - KNN")


## Problem 2

# Load the Auto data from ISLR as a data frame and list its variables.
library(ISLR)
AutoData <- data.frame(Auto)
names(AutoData)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"  
## [5] "weight"       "acceleration" "year"         "origin"      
## [9] "name"

Part a: The following scatter plot shows all possible plotting permutations of the features in the Auto data set.

# Scatter-plot matrix of every pair of variables.
pairs(AutoData, col = "dark blue")

# Reset the graphics device, show the current margins, then shrink them
# for the individual plots that follow.
graphics.off()
par("mar")
par(mar = c(1, 1, 1, 1))
## [1] 5.1 4.1 4.1 2.1

However, the following plots have more interesting meanings since the features which are more correlated conceptually are plotted.

# par(mfrow=c(2,2))
# Fix: all eight plots carried the placeholder title "Scatterplot Example";
# each plot now gets a title naming the variable pair it shows. The repeated
# plot() calls are driven from one spec table so axis labels, colours and
# symbols stay in a single place.
scatter_specs <- list(
  list(x = 1, y = 3, xlab = "mpg", ylab = "displacement", col = "dark red", pch = 14),
  list(x = 5, y = 6, xlab = "weight", ylab = "acceleration", col = "blue", pch = 19),
  list(x = 7, y = 6, xlab = "year", ylab = "acceleration", col = "dark blue", pch = 19),
  list(x = 4, y = 6, xlab = "horsepower", ylab = "acceleration", col = "green", pch = 19),
  list(x = 8, y = 3, xlab = "origin", ylab = "displacement", col = "dark green", pch = 19),
  list(x = 4, y = 7, xlab = "horsepower", ylab = "year", col = "cyan", pch = 19),
  list(x = 3, y = 4, xlab = "displacement", ylab = "horsepower", col = "black", pch = 19),
  list(x = 5, y = 4, xlab = "weight", ylab = "horsepower", col = "yellow", pch = 19)
)
for (s in scatter_specs) {
  plot(AutoData[, s$x], AutoData[, s$y],
       main = paste(s$xlab, "vs", s$ylab),
       xlab = s$xlab, ylab = s$ylab, col = s$col, pch = s$pch)
}


Part b:

l <- dim(AutoData)
# Keep the quantitative columns only (drop the last column, the car name).
Quantdata <- data.frame(AutoData[, 1:(l[2] - 1)])

# Pairwise correlations between the quantitative variables.
# Fixes: the original filled the matrix with a hand-written double loop over
# cor() of single columns, and it passed the whole dim() vector to nrow=
# (silently using only its first element — the ROW count of the data — so the
# matrix had hundreds of zero rows that then had to be trimmed with
# CorMx[1:8, 1:8]). cor() on the data frame computes the same 8x8 matrix in
# one call, with proper row/column names.
dimension <- dim(Quantdata)
Features <- c(1:dimension[2])  # kept for the (commented-out) per-feature plots
CorMx <- data.frame(cor(Quantdata))
print(CorMx)
##           X1         X2         X3         X4         X5         X6
## 1  1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442  0.4233285
## 2 -0.7776175  1.0000000  0.9508233  0.8429834  0.8975273 -0.5046834
## 3 -0.8051269  0.9508233  1.0000000  0.8972570  0.9329944 -0.5438005
## 4 -0.7784268  0.8429834  0.8972570  1.0000000  0.8645377 -0.6891955
## 5 -0.8322442  0.8975273  0.9329944  0.8645377  1.0000000 -0.4168392
## 6  0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392  1.0000000
## 7  0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199  0.2903161
## 8  0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054  0.2127458
##           X7         X8
## 1  0.5805410  0.5652088
## 2 -0.3456474 -0.5689316
## 3 -0.3698552 -0.6145351
## 4 -0.4163615 -0.4551715
## 5 -0.3091199 -0.5850054
## 6  0.2903161  0.2127458
## 7  1.0000000  0.1815277
## 8  0.1815277  1.0000000
# row.names(CorMx) <-names(Quantdata)
# colnames(CorMx) <- names(Quantdata)
# write.csv(CorMx, "Question9b_CorrelationMx.csv")


Part c: The coefficients show how much each feature influences the fitted model. For instance, among these predictors the one with the largest coefficient for mpg is "year". Note, however, that the magnitude of a coefficient depends on the units of its predictor, so the small coefficient for "weight" does not by itself imply a weak association — "weight" is in fact highly significant in the fit below. Since the model is linear, X and Y are assumed to have a linear relationship, and the intercept represents the predicted response when all predictors are zero.

  # Multiple linear regression of mpg on all the other quantitative variables.
  # The response is passed as Quantdata$mpg while the predictor data frame is
  # Quantdata[2:dimension[2]] (every column EXCEPT mpg), so the response does
  # not also appear on the right-hand side of the formula.
  MyLinRegression <- lm(Quantdata$mpg~.,data = Quantdata[2:dimension[2]])
  summary(MyLinRegression)
## 
## Call:
## lm(formula = Quantdata$mpg ~ ., data = Quantdata[2:dimension[2]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16
  print(MyLinRegression[1]) 
## $coefficients
##   (Intercept)     cylinders  displacement    horsepower        weight 
## -17.218434622  -0.493376319   0.019895644  -0.016951144  -0.006474043 
##  acceleration          year        origin 
##   0.080575838   0.750772678   1.426140495


Part d: 1. The first graph (residuals vs. fitted) shows that there is a non-linear relationship between the response and the predictors.
2. The second graph (normal Q–Q) shows that the residuals are approximately normally distributed but right-skewed in the upper tail.
3. The third graph (scale–location) shows that the constant-variance assumption does not hold for this model.
4. The fourth graph (residuals vs. leverage) shows no clearly influential points, although one observation stands out as a potential leverage point.

par(mfrow=c(2,2))
plot(MyLinRegression)

Part e: The last model is the only one with all variables being significant. The R-squared statistics estimates that 87% of the changes in the response can be explained by this particular set of predictors ( single and interaction.)

# Multiple regression with interaction terms added on top of all main effects.
# Fix: acceleration:horsepower was listed twice in the original formula;
# terms() silently drops the duplicate, so the fit is unchanged, but the
# repetition was noise.
model <- lm(mpg ~ . + displacement:cylinders + displacement:weight +
              acceleration:horsepower + acceleration:year + weight:year,
            data = Quantdata)
model[1]
## $coefficients
##             (Intercept)               cylinders            displacement 
##            2.271566e+01            9.149696e-01           -6.290367e-02 
##              horsepower                  weight            acceleration 
##            7.440814e-03            5.598439e-03           -4.493608e+00 
##                    year                  origin  cylinders:displacement 
##            3.123735e-01            4.293904e-01           -2.128786e-03 
##     displacement:weight horsepower:acceleration       acceleration:year 
##            2.034438e-05           -3.809537e-03            6.346235e-02 
##             weight:year 
##           -1.958574e-04
summary(model)
## 
## Call:
## lm(formula = mpg ~ . + displacement:cylinders + displacement:weight + 
##     acceleration:horsepower + acceleration:year + acceleration:horsepower + 
##     weight:year, data = Quantdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1690 -1.5219  0.0553  1.3393 11.8468 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.272e+01  3.107e+01   0.731 0.465226    
## cylinders                9.150e-01  6.024e-01   1.519 0.129656    
## displacement            -6.290e-02  1.297e-02  -4.850 1.81e-06 ***
## horsepower               7.441e-03  2.697e-02   0.276 0.782758    
## weight                   5.598e-03  5.352e-03   1.046 0.296235    
## acceleration            -4.494e+00  1.439e+00  -3.122 0.001931 ** 
## year                     3.124e-01  3.901e-01   0.801 0.423834    
## origin                   4.294e-01  2.492e-01   1.723 0.085655 .  
## cylinders:displacement  -2.129e-03  2.703e-03  -0.788 0.431476    
## displacement:weight      2.034e-05  3.895e-06   5.224 2.90e-07 ***
## horsepower:acceleration -3.810e-03  1.872e-03  -2.035 0.042563 *  
## acceleration:year        6.346e-02  1.789e-02   3.548 0.000436 ***
## weight:year             -1.959e-04  6.534e-05  -2.997 0.002903 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.785 on 379 degrees of freedom
## Multiple R-squared:  0.8766, Adjusted R-squared:  0.8727 
## F-statistic: 224.3 on 12 and 379 DF,  p-value: < 2.2e-16
# Same model written with `*` for the first two interactions: a*b expands to
# a + b + a:b, and the main effects are already supplied by `.`, so the fitted
# coefficients are identical to the model above.
# Fix: the duplicated acceleration:horsepower term from the original formula
# is dropped (terms() removed it anyway).
model <- lm(mpg ~ . + displacement * cylinders + displacement * weight +
              acceleration:horsepower + acceleration:year + weight:year,
            data = Quantdata)
model[1]
## $coefficients
##             (Intercept)               cylinders            displacement 
##            2.271566e+01            9.149696e-01           -6.290367e-02 
##              horsepower                  weight            acceleration 
##            7.440814e-03            5.598439e-03           -4.493608e+00 
##                    year                  origin  cylinders:displacement 
##            3.123735e-01            4.293904e-01           -2.128786e-03 
##     displacement:weight horsepower:acceleration       acceleration:year 
##            2.034438e-05           -3.809537e-03            6.346235e-02 
##             weight:year 
##           -1.958574e-04
# We have fitted the the multiple linear regression model to the mpg feature. 
# the significance of the coefficients is shown by the stars as follows:
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Residual standard error: 2.785 on 379 degrees of freedom
# Multiple R-squared:  0.8766,  Adjusted R-squared:  0.8727 
# F-statistic: 224.3 on 12 and 379 DF,  p-value: < 2.2e-16

# Reduced model: drop the insignificant main effects (and the qualitative
# `name` column) and keep the interactions that remain significant.
# Fix: displacement:weight appeared twice in the original formula; the
# duplicate is removed (terms() dropped it anyway, so the fit is unchanged).
model <- lm(mpg ~ . - name - cylinders - acceleration + year:origin +
              displacement:weight + acceleration:horsepower +
              acceleration:weight, data = Auto)
summary(model)
## 
## Call:
## lm(formula = mpg ~ . - name - cylinders - acceleration + year:origin + 
##     displacement:weight + displacement:weight + acceleration:horsepower + 
##     acceleration:weight, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5074 -1.6324  0.0599  1.4577 12.7376 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.868e+01  7.796e+00   2.396 0.017051 *  
## displacement            -7.794e-02  9.026e-03  -8.636  < 2e-16 ***
## horsepower               8.719e-02  3.167e-02   2.753 0.006183 ** 
## weight                  -1.350e-02  1.287e-03 -10.490  < 2e-16 ***
## year                     4.911e-01  9.825e-02   4.998 8.83e-07 ***
## origin                  -1.262e+01  4.109e+00  -3.071 0.002288 ** 
## year:origin              1.686e-01  5.277e-02   3.195 0.001516 ** 
## displacement:weight      2.253e-05  2.184e-06  10.312  < 2e-16 ***
## horsepower:acceleration -9.164e-03  2.222e-03  -4.125 4.56e-05 ***
## weight:acceleration      2.784e-04  7.087e-05   3.929 0.000101 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.861 on 382 degrees of freedom
## Multiple R-squared:  0.8687, Adjusted R-squared:  0.8656 
## F-statistic: 280.8 on 9 and 382 DF,  p-value: < 2.2e-16
model[1]
## $coefficients
##             (Intercept)            displacement              horsepower 
##            1.868032e+01           -7.794412e-02            8.718910e-02 
##                  weight                    year                  origin 
##           -1.350042e-02            4.910713e-01           -1.261881e+01 
##             year:origin     displacement:weight horsepower:acceleration 
##            1.685863e-01            2.252590e-05           -9.163626e-03 
##     weight:acceleration 
##            2.784229e-04


Part f: In the following we got the log(mpg), and we fit a linear regression model to the Auto data.

# Part f: fit the same regression under several simple transformations and
# compare the Euclidean (L2) norm of the residual vectors.
# NOTE(review): the log(mpg) model's residuals live on the log scale, so its
# norm (2.33) is not directly comparable with the untransformed-response
# models below — confirm before drawing conclusions from these numbers.
logY <- lm(log(Quantdata$mpg) ~ ., data = Quantdata[2:dimension[2]])
# summary(logY)
logYError <- print(norm(as.numeric(residuals(logY)), type = "2"))
## [1] 2.333954
logX <- lm(Quantdata$mpg ~ ., data = log(Quantdata[2:dimension[2]]))
# summary(logX)
logXError <- print(norm(as.numeric(residuals(logX)), type = "2"))
## [1] 60.13133
SqrtX <- lm(Quantdata$mpg ~ ., data = sqrt(Quantdata[2:dimension[2]]))
# summary(SqrtX)
SqrtXError <- print(norm(as.numeric(residuals(SqrtX)), type = "2"))
## [1] 62.90918
par(mfrow = c(3, 4))
Power2X <- lm(Quantdata$mpg ~ ., data = (Quantdata[2:dimension[2]])^2)
# summary(Power2X)
Power2XError <- print(norm(as.numeric(residuals(Power2X)), type = "2"))
## [1] 69.35275
plot(logX, main = "log(X) diagnosis")
plot(SqrtX, main = "Sqrt(X) diagnosis")
plot(Power2X, main = "(X)^2 diagnosis")

Problem 3


This problem involves the “Boston” data set, which we saw in the lab for this chapter. We will now try to predict per capita crime rate using the other variables in this data set. In other words, per capita crime rate is the response, and the other variables are the predictors.

Part a: For each predictor, fit a simple linear regression model to predict the response. Describe your results. In which of the models is there a statistically significant association between the predictor and the response ? Create some plots to back up your assertions.

First, the Boston data is loaded as a data frame:

# Packages for tables, models, plotting, and the Boston data (in MASS).
library(kableExtra)
library(MASS)
library(ggplot2)
library(jtools)
library(corrplot)
## corrplot 0.84 loaded
library(ggpubr)
## Loading required package: magrittr
library(knitr)
BostonData <- data.frame(Boston)
# Make the column names explicit (Boston already uses these names).
colnames(BostonData) <- c(
  "crim", "zn", "indus", "chas", "nox", "rm", "age",
  "dis", "rad", "tax", "ptratio", "black", "lstat", "medv"
)

Then, a simple regression model is fitted between the crime levels (estimate) and all the predictors in the data set.

# Get simple linear fit for all predictors.
# Fix: the original called attach(BostonData) here. Every lm() below already
# passes data = BostonData explicitly, so the attach was unnecessary and only
# risked masking global variables; it has been removed.
#Zn = proportion of residential land zoned for lots over 25,000 sq.ft.
lm.fitZn <- lm(crim ~ zn, data = BostonData)
summ(lm.fitZn)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 21.10
0.04
Adj. R² 0.04
Est. S.E. t val. p
(Intercept) 4.45 0.42 10.67 0.00
zn -0.07 0.02 -4.59 0.00
Standard errors: OLS
# indus = proportion of non-retail business acres per town.
lm.fitIndus <- lm(crim ~ indus, data = BostonData)
summ(lm.fitIndus)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 99.82
0.17
Adj. R² 0.16
Est. S.E. t val. p
(Intercept) -2.06 0.67 -3.09 0.00
indus 0.51 0.05 9.99 0.00
Standard errors: OLS
# chas = Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
lm.fitChas <- lm(crim ~ chas, data = BostonData)
summ(lm.fitChas)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 1.58
0.00
Adj. R² 0.00
Est. S.E. t val. p
(Intercept) 3.74 0.40 9.45 0.00
chas -1.89 1.51 -1.26 0.21
Standard errors: OLS
# nox = nitrogen oxides concentration (parts per 10 million).
lm.fitNox <- lm(crim ~ nox, data = BostonData)
summ(lm.fitNox)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 108.56
0.18
Adj. R² 0.18
Est. S.E. t val. p
(Intercept) -13.72 1.70 -8.07 0.00
nox 31.25 3.00 10.42 0.00
Standard errors: OLS
# rm = average number of rooms per dwelling.
lm.fitRm <- lm(crim ~ rm, data = BostonData)
summ(lm.fitRm)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 25.45
0.05
Adj. R² 0.05
Est. S.E. t val. p
(Intercept) 20.48 3.36 6.09 0.00
rm -2.68 0.53 -5.04 0.00
Standard errors: OLS
# age = proportion of owner-occupied units built prior to 1940.
lm.fitAge <- lm(crim ~ age, data = BostonData)
summ(lm.fitAge)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 71.62
0.12
Adj. R² 0.12
Est. S.E. t val. p
(Intercept) -3.78 0.94 -4.00 0.00
age 0.11 0.01 8.46 0.00
Standard errors: OLS
# dis = weighted mean of distances to five Boston employment centres.
lm.fitDis <- lm(crim ~ dis, data = BostonData)
summ(lm.fitDis)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 84.89
0.14
Adj. R² 0.14
Est. S.E. t val. p
(Intercept) 9.50 0.73 13.01 0.00
dis -1.55 0.17 -9.21 0.00
Standard errors: OLS
# rad = index of accessibility to radial highways.
lm.fitRad <- lm(crim ~ rad, data = BostonData)
summ(lm.fitRad)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 323.94
0.39
Adj. R² 0.39
Est. S.E. t val. p
(Intercept) -2.29 0.44 -5.16 0.00
rad 0.62 0.03 18.00 0.00
Standard errors: OLS
# tax = full-value property-tax rate per $10,000.
lm.fitTax <- lm(crim ~ tax, data = BostonData)
summ(lm.fitTax)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 259.19
0.34
Adj. R² 0.34
Est. S.E. t val. p
(Intercept) -8.53 0.82 -10.45 0.00
tax 0.03 0.00 16.10 0.00
Standard errors: OLS
# ptratio = pupil-teacher ratio by town.
lm.fitPtratio <- lm(crim ~ ptratio, data = BostonData)
summ(lm.fitPtratio)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 46.26
0.08
Adj. R² 0.08
Est. S.E. t val. p
(Intercept) -17.65 3.15 -5.61 0.00
ptratio 1.15 0.17 6.80 0.00
Standard errors: OLS
# black = 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
lm.fitBlack <- lm(crim ~ black, data = BostonData)
summ(lm.fitBlack)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 87.74
0.15
Adj. R² 0.15
Est. S.E. t val. p
(Intercept) 16.55 1.43 11.61 0.00
black -0.04 0.00 -9.37 0.00
Standard errors: OLS
# lstat = lower status of the population (percent).
lm.fitLstat <- lm(crim ~ lstat, data = BostonData)
summ(lm.fitLstat)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 132.04
0.21
Adj. R² 0.21
Est. S.E. t val. p
(Intercept) -3.33 0.69 -4.80 0.00
lstat 0.55 0.05 11.49 0.00
Standard errors: OLS
# medv = median value of owner-occupied homes in $1000s.
lm.fitMedv <- lm(crim ~ medv, data = BostonData)
summ(lm.fitMedv)
Observations 506
Dependent variable crim
Type OLS linear regression
F(1,504) 89.49
0.15
Adj. R² 0.15
Est. S.E. t val. p
(Intercept) 11.80 0.93 12.63 0.00
medv -0.36 0.04 -9.46 0.00
Standard errors: OLS

If we consider p<0.05 as statistically significant, then all predictors are significant, except for chas. In other words, the null hypothesis for chas cannot be rejected: \(H_0: \beta_{chas} = 0\). To compare the regression between a significant predictor (independent variable) such as nox and an insignificant one, such as chas, one can plot their regression lines as shown below.

The \(R^2\) values were summarized in a table to show goodness of fit for each linear model.The best fit lines are given by nox,lstat,rad and indus.

kable(round(rSqVals,3),caption = 'R squared values for Univariate Linear Regression',format="markdown")
zn indus chas nox rm age dis rad tax ptratio black lstat medv
0.04 0.165 0.003 0.177 0.048 0.124 0.144 0.391 0.34 0.084 0.148 0.208 0.151

Part b: Fit a multiple regression model to predict the response using all of the predictors. Describe your results. For which predictors can we reject the null hypothesis \(H_0:\beta_j = 0\) ?

In this case the model used is \(Y = \beta_0 + \beta_1X_1+...+\beta_{13}X_{13}\), where we interpret \(\beta_1\) as the average effect on Y of a one unit increase in \(X_1\) while holding all other predictors fixed.

# Multiple regression: crim on all thirteen predictors at once.
lm.MRegfit <- lm(crim ~ ., data = BostonData)
summ(lm.MRegfit)
Observations 506
Dependent variable crim
Type OLS linear regression
F(13,492) 31.47
0.45
Adj. R² 0.44
Est. S.E. t val. p
(Intercept) 17.03 7.23 2.35 0.02
zn 0.04 0.02 2.39 0.02
indus -0.06 0.08 -0.77 0.44
chas -0.75 1.18 -0.63 0.53
nox -10.31 5.28 -1.95 0.05
rm 0.43 0.61 0.70 0.48
age 0.00 0.02 0.08 0.94
dis -0.99 0.28 -3.50 0.00
rad 0.59 0.09 6.68 0.00
tax -0.00 0.01 -0.73 0.46
ptratio -0.27 0.19 -1.45 0.15
black -0.01 0.00 -2.05 0.04
lstat 0.13 0.08 1.67 0.10
medv -0.20 0.06 -3.29 0.00
Standard errors: OLS

Since the p-values for "zn", "dis", "rad", "black" and "medv" are below 0.05, the null hypothesis can be rejected for these predictors. It is important to notice that while "indus", "nox", "rm", "age", "tax", "ptratio" and "lstat" were considered significant in the univariate regressions, in the multiple regression they are no longer significant, because they are correlated with some of the significant predictors. The difference is that in univariate regression each coefficient represents the average change in crime associated with one predictor, ignoring all other predictors, while multiple regression accounts for all the other predictors as well (holding them fixed).

A correlation matrix was computed to further explore this concept. Variables such as “indus”,“rm”,“age”,“tax”,“ptratio”,“lstat” are correlated to significant predictors, thus they do not directly contribute to the dependent variable, but they are surrogates for variables such as “medv”, “zn”, “dis”,“rad”.

# Correlation matrix of all Boston variables, drawn as a circle plot.
correlMx <- cor(BostonData)
corrplot(correlMx, method = "circle")

For example, the variable "dis" (weighted mean of distances to five Boston employment centres) is strongly correlated with "indus" (-0.7) and "age" (-0.7), implying that the proportion of non-retail business acres per town (indus) and the proportion of owner-occupied units built prior to 1940 (age) are typically lower in areas where the distance to Boston employment centres (dis) is higher, but they do not necessarily affect the crime rate (the dependent variable). While higher "indus" values are associated with higher crime values, "indus" does not actually affect crime, implying that it is simply taking credit for the effect of "dis" on crime.

Part c: How do your results from (a) compare to your results from (b) ? Create a plot displaying the univariate regression coefficients from (a) on the x-axis, and the multiple regression coefficients from (b) on the y-axis.

The comparison is explained in (b), where the effect of correlation between some of the predictors changes the level of significance of some of the independent variables in predicting the dependent variable.

# Collect, for every predictor, the slope estimate from its univariate fit.
# Fix: the original repeated the same extraction statement thirteen times;
# a named list of fits plus vapply() builds the identical numeric vector
# (unname() keeps it unnamed, exactly as before).
uniFits <- list(
  zn = lm.fitZn, indus = lm.fitIndus, chas = lm.fitChas, nox = lm.fitNox,
  rm = lm.fitRm, age = lm.fitAge, dis = lm.fitDis, rad = lm.fitRad,
  tax = lm.fitTax, ptratio = lm.fitPtratio, black = lm.fitBlack,
  lstat = lm.fitLstat, medv = lm.fitMedv
)
coeffsLReg <- unname(vapply(
  names(uniFits),
  function(v) coef(summary(uniFits[[v]]))[v, "Estimate"],
  numeric(1)
))
# Concatenate all the multiple regression coeffs (dropping the intercept).
coeffsMReg <- coefficients(lm.MRegfit)
coeffsMReg <- coeffsMReg[-1]

Variable nox seems to be an outlier, due to its large (~30) univariate coefficient. To further inspect the difference between the coefficients from these two methods, the absolute value of the difference between the univariate and multivariate regression were plotted as well; note that the “nox” coefficient was removed since its difference was the largest.

Since indus, rm, ptratio and nox are not significant in the multiregression case, but significant in the univariate case, it makes sense that their coefficients are the most different.

Part d: Is there evidence of a non-linear association between any of the predictors and the response? A model of the form \(Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \epsilon\) was constructed for each predictor to show whether the additional coefficients create a better fit.

# Cubic polynomial fit of crim on each predictor. chas is skipped because a
# 0/1 dummy variable has no meaningful quadratic or cubic terms.
polyfitZn      <- lm(crim ~ poly(zn, 3),      data = BostonData)
polyfitIndus   <- lm(crim ~ poly(indus, 3),   data = BostonData)
polyfitNox     <- lm(crim ~ poly(nox, 3),     data = BostonData)
polyfitRm      <- lm(crim ~ poly(rm, 3),      data = BostonData)
polyfitAge     <- lm(crim ~ poly(age, 3),     data = BostonData)
polyfitDis     <- lm(crim ~ poly(dis, 3),     data = BostonData)
polyfitRad     <- lm(crim ~ poly(rad, 3),     data = BostonData)
polyfitTax     <- lm(crim ~ poly(tax, 3),     data = BostonData)
polyfitPtratio <- lm(crim ~ poly(ptratio, 3), data = BostonData)
polyfitBlack   <- lm(crim ~ poly(black, 3),   data = BostonData)
polyfitLstat   <- lm(crim ~ poly(lstat, 3),   data = BostonData)
polyfitMedv    <- lm(crim ~ poly(medv, 3),    data = BostonData)
#summ(polyfit) function was used to inspect the R^2 and p values for each fit

The p-value for each coefficient in the fit is inspected to assess whether it is significant (p<0.05). If the quadratic or cubic coefficient is significant, then a nonlinear model may be justified for that predictor. In this case "indus", "nox", "age", "dis", "ptratio" and "medv" appear to justify the cubic term, since its p-value is below 0.05, so the null hypothesis is rejected for them.

knitr::kable(pVals, caption = "P-values table", floating.environment="sidewaystable",format="markdown")
zn indus nox rm age dis rad tax ptratio black lstat medv
beta0 0.0000000 0.0000000 0.00e+00 0.0000000 0.0000000 0 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0
beta1 0.0000047 0.0000000 0.00e+00 0.0000005 0.0000000 0 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0
beta2 0.0044205 0.0010861 7.74e-05 0.0015085 0.0000023 0 0.0091206 0.0000037 0.0024055 0.4566044 0.0378042 0
beta3 0.2295386 0.0000000 0.00e+00 0.5085751 0.0066799 0 0.4823138 0.2438507 0.0063005 0.5436172 0.1298906 0

Another method to check for nonlinearity is to assess the prediction-versus-residual trend; a non-random pattern usually suggests that a simple linear model may not be sufficient, and that a nonlinear fit may be better. A residual-versus-prediction plot was constructed for the univariate regressions of "indus", "nox", "dis" and "medv", as their p-values for the cubic coefficient were near zero, implying that they should follow a nonlinear fit.